MSB305 Final project: Predicting Grocery Store Daily Sales Per Product Family¶

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
In [3]:
train1 = pd.read_csv("train.csv", index_col=0, keep_date_col=True)
In [4]:
test1 = pd.read_csv("test.csv", index_col=0)
In [5]:
df0 = pd.concat([train1, test1])
df0
Out[5]:
date store_nbr family sales onpromotion
id
0 2013-01-01 1 AUTOMOTIVE 0.0 0
1 2013-01-01 1 BABY CARE 0.0 0
2 2013-01-01 1 BEAUTY 0.0 0
3 2013-01-01 1 BEVERAGES 0.0 0
4 2013-01-01 1 BOOKS 0.0 0
... ... ... ... ... ...
3029395 2017-08-31 9 POULTRY NaN 1
3029396 2017-08-31 9 PREPARED FOODS NaN 0
3029397 2017-08-31 9 PRODUCE NaN 1
3029398 2017-08-31 9 SCHOOL AND OFFICE SUPPLIES NaN 9
3029399 2017-08-31 9 SEAFOOD NaN 0

3029400 rows × 5 columns

Part I: Data Integration¶

integrate oil price dataset:¶

In [6]:
gas = pd.read_csv("oil.csv", names=['date','gas_price'], header=0)
In [7]:
date_range = pd.period_range(start="2013-01-01", end="2017-08-31")
In [8]:
date_range1 = date_range.astype(str, copy=False)
In [9]:
calendar = pd.DataFrame(date_range1, columns= ["date"])
calendar.dtypes
Out[9]:
date    object
dtype: object
In [10]:
oil = pd.merge(left=calendar, right = gas, how = "left", on="date")
In [11]:
oil
Out[11]:
date gas_price
0 2013-01-01 NaN
1 2013-01-02 93.14
2 2013-01-03 92.97
3 2013-01-04 93.12
4 2013-01-05 NaN
... ... ...
1699 2017-08-27 NaN
1700 2017-08-28 46.40
1701 2017-08-29 46.46
1702 2017-08-30 45.96
1703 2017-08-31 47.26

1704 rows × 2 columns

In [12]:
interpolated_oil = oil.interpolate(method="linear")
In [13]:
interpolated_oil.fillna(93.140000, inplace=True)
In [14]:
interpolated_oil
Out[14]:
date gas_price
0 2013-01-01 93.140000
1 2013-01-02 93.140000
2 2013-01-03 92.970000
3 2013-01-04 93.120000
4 2013-01-05 93.146667
... ... ...
1699 2017-08-27 46.816667
1700 2017-08-28 46.400000
1701 2017-08-29 46.460000
1702 2017-08-30 45.960000
1703 2017-08-31 47.260000

1704 rows × 2 columns

In [15]:
df1 = pd.merge(left=df0, right=interpolated_oil, how="left", on="date")
df1
Out[15]:
date store_nbr family sales onpromotion gas_price
0 2013-01-01 1 AUTOMOTIVE 0.0 0 93.14
1 2013-01-01 1 BABY CARE 0.0 0 93.14
2 2013-01-01 1 BEAUTY 0.0 0 93.14
3 2013-01-01 1 BEVERAGES 0.0 0 93.14
4 2013-01-01 1 BOOKS 0.0 0 93.14
... ... ... ... ... ... ...
3029395 2017-08-31 9 POULTRY NaN 1 47.26
3029396 2017-08-31 9 PREPARED FOODS NaN 0 47.26
3029397 2017-08-31 9 PRODUCE NaN 1 47.26
3029398 2017-08-31 9 SCHOOL AND OFFICE SUPPLIES NaN 9 47.26
3029399 2017-08-31 9 SEAFOOD NaN 0 47.26

3029400 rows × 6 columns

Integrate Stores dataset:¶

In [16]:
stores = pd.read_csv("stores.csv")
stores
Out[16]:
store_nbr city state type cluster
0 1 Quito Pichincha D 13
1 2 Quito Pichincha D 13
2 3 Quito Pichincha D 8
3 4 Quito Pichincha D 9
4 5 Santo Domingo Santo Domingo de los Tsachilas D 4
5 6 Quito Pichincha D 13
6 7 Quito Pichincha D 8
7 8 Quito Pichincha D 8
8 9 Quito Pichincha B 6
9 10 Quito Pichincha C 15
10 11 Cayambe Pichincha B 6
11 12 Latacunga Cotopaxi C 15
12 13 Latacunga Cotopaxi C 15
13 14 Riobamba Chimborazo C 7
14 15 Ibarra Imbabura C 15
15 16 Santo Domingo Santo Domingo de los Tsachilas C 3
16 17 Quito Pichincha C 12
17 18 Quito Pichincha B 16
18 19 Guaranda Bolivar C 15
19 20 Quito Pichincha B 6
20 21 Santo Domingo Santo Domingo de los Tsachilas B 6
21 22 Puyo Pastaza C 7
22 23 Ambato Tungurahua D 9
23 24 Guayaquil Guayas D 1
24 25 Salinas Santa Elena D 1
25 26 Guayaquil Guayas D 10
26 27 Daule Guayas D 1
27 28 Guayaquil Guayas E 10
28 29 Guayaquil Guayas E 10
29 30 Guayaquil Guayas C 3
30 31 Babahoyo Los Rios B 10
31 32 Guayaquil Guayas C 3
32 33 Quevedo Los Rios C 3
33 34 Guayaquil Guayas B 6
34 35 Playas Guayas C 3
35 36 Libertad Guayas E 10
36 37 Cuenca Azuay D 2
37 38 Loja Loja D 4
38 39 Cuenca Azuay B 6
39 40 Machala El Oro C 3
40 41 Machala El Oro D 4
41 42 Cuenca Azuay D 2
42 43 Esmeraldas Esmeraldas E 10
43 44 Quito Pichincha A 5
44 45 Quito Pichincha A 11
45 46 Quito Pichincha A 14
46 47 Quito Pichincha A 14
47 48 Quito Pichincha A 14
48 49 Quito Pichincha A 11
49 50 Ambato Tungurahua A 14
50 51 Guayaquil Guayas A 17
51 52 Manta Manabi A 11
52 53 Manta Manabi D 13
53 54 El Carmen Manabi C 3
In [17]:
df2 = pd.merge(left=df1, right=stores, how="left", on="store_nbr")
df2
Out[17]:
date store_nbr family sales onpromotion gas_price city state type cluster
0 2013-01-01 1 AUTOMOTIVE 0.0 0 93.14 Quito Pichincha D 13
1 2013-01-01 1 BABY CARE 0.0 0 93.14 Quito Pichincha D 13
2 2013-01-01 1 BEAUTY 0.0 0 93.14 Quito Pichincha D 13
3 2013-01-01 1 BEVERAGES 0.0 0 93.14 Quito Pichincha D 13
4 2013-01-01 1 BOOKS 0.0 0 93.14 Quito Pichincha D 13
... ... ... ... ... ... ... ... ... ... ...
3029395 2017-08-31 9 POULTRY NaN 1 47.26 Quito Pichincha B 6
3029396 2017-08-31 9 PREPARED FOODS NaN 0 47.26 Quito Pichincha B 6
3029397 2017-08-31 9 PRODUCE NaN 1 47.26 Quito Pichincha B 6
3029398 2017-08-31 9 SCHOOL AND OFFICE SUPPLIES NaN 9 47.26 Quito Pichincha B 6
3029399 2017-08-31 9 SEAFOOD NaN 0 47.26 Quito Pichincha B 6

3029400 rows × 10 columns

integrate holiday dataset:¶

In [18]:
holiday = pd.read_csv("holidays_events.csv")
In [19]:
holiday
Out[19]:
date type locale locale_name description transferred
0 2012-03-02 Holiday Local Manta Fundacion de Manta False
1 2012-04-01 Holiday Regional Cotopaxi Provincializacion de Cotopaxi False
2 2012-04-12 Holiday Local Cuenca Fundacion de Cuenca False
3 2012-04-14 Holiday Local Libertad Cantonizacion de Libertad False
4 2012-04-21 Holiday Local Riobamba Cantonizacion de Riobamba False
... ... ... ... ... ... ...
345 2017-12-22 Additional National Ecuador Navidad-3 False
346 2017-12-23 Additional National Ecuador Navidad-2 False
347 2017-12-24 Additional National Ecuador Navidad-1 False
348 2017-12-25 Holiday National Ecuador Navidad False
349 2017-12-26 Additional National Ecuador Navidad+1 False

350 rows × 6 columns

In [20]:
holiday_national=holiday.loc[holiday['locale_name'] == "Ecuador" , :]
holiday_national
Out[20]:
date type locale locale_name description transferred
14 2012-08-10 Holiday National Ecuador Primer Grito de Independencia False
19 2012-10-09 Holiday National Ecuador Independencia de Guayaquil True
20 2012-10-12 Transfer National Ecuador Traslado Independencia de Guayaquil False
21 2012-11-02 Holiday National Ecuador Dia de Difuntos False
22 2012-11-03 Holiday National Ecuador Independencia de Cuenca False
... ... ... ... ... ... ...
345 2017-12-22 Additional National Ecuador Navidad-3 False
346 2017-12-23 Additional National Ecuador Navidad-2 False
347 2017-12-24 Additional National Ecuador Navidad-1 False
348 2017-12-25 Holiday National Ecuador Navidad False
349 2017-12-26 Additional National Ecuador Navidad+1 False

174 rows × 6 columns

In [21]:
locale_name = set(holiday.locale_name)
holiday.locale_name.value_counts()
Out[21]:
Ecuador                           174
Quito                              13
Riobamba                           12
Guaranda                           12
Latacunga                          12
Ambato                             12
Guayaquil                          11
Cuenca                              7
Ibarra                              7
Salinas                             6
Loja                                6
Santa Elena                         6
Santo Domingo de los Tsachilas      6
Quevedo                             6
Manta                               6
Esmeraldas                          6
Cotopaxi                            6
El Carmen                           6
Santo Domingo                       6
Machala                             6
Imbabura                            6
Puyo                                6
Libertad                            6
Cayambe                             6
Name: locale_name, dtype: int64
In [22]:
city_name = set(df2.city)
df2.city.value_counts()
Out[22]:
Quito            1009800
Guayaquil         448800
Santo Domingo     168300
Cuenca            168300
Latacunga         112200
Manta             112200
Machala           112200
Ambato            112200
Quevedo            56100
Esmeraldas         56100
Loja               56100
Libertad           56100
Playas             56100
Daule              56100
Babahoyo           56100
Cayambe            56100
Salinas            56100
Puyo               56100
Guaranda           56100
Ibarra             56100
Riobamba           56100
El Carmen          56100
Name: city, dtype: int64
In [23]:
city_name & locale_name
Out[23]:
{'Ambato',
 'Cayambe',
 'Cuenca',
 'El Carmen',
 'Esmeraldas',
 'Guaranda',
 'Guayaquil',
 'Ibarra',
 'Latacunga',
 'Libertad',
 'Loja',
 'Machala',
 'Manta',
 'Puyo',
 'Quevedo',
 'Quito',
 'Riobamba',
 'Salinas',
 'Santo Domingo'}
In [24]:
df2.state.value_counts()
Out[24]:
Pichincha                         1065900
Guayas                             617100
Santo Domingo de los Tsachilas     168300
Azuay                              168300
Manabi                             168300
Cotopaxi                           112200
Tungurahua                         112200
Los Rios                           112200
El Oro                             112200
Chimborazo                          56100
Imbabura                            56100
Bolivar                             56100
Pastaza                             56100
Santa Elena                         56100
Loja                                56100
Esmeraldas                          56100
Name: state, dtype: int64
In [25]:
set(df2.state) & locale_name
Out[25]:
{'Cotopaxi',
 'Esmeraldas',
 'Imbabura',
 'Loja',
 'Santa Elena',
 'Santo Domingo de los Tsachilas'}
In [26]:
holiday
Out[26]:
date type locale locale_name description transferred
0 2012-03-02 Holiday Local Manta Fundacion de Manta False
1 2012-04-01 Holiday Regional Cotopaxi Provincializacion de Cotopaxi False
2 2012-04-12 Holiday Local Cuenca Fundacion de Cuenca False
3 2012-04-14 Holiday Local Libertad Cantonizacion de Libertad False
4 2012-04-21 Holiday Local Riobamba Cantonizacion de Riobamba False
... ... ... ... ... ... ...
345 2017-12-22 Additional National Ecuador Navidad-3 False
346 2017-12-23 Additional National Ecuador Navidad-2 False
347 2017-12-24 Additional National Ecuador Navidad-1 False
348 2017-12-25 Holiday National Ecuador Navidad False
349 2017-12-26 Additional National Ecuador Navidad+1 False

350 rows × 6 columns

In [27]:
# add national holidays:
In [28]:
df3_national = pd.merge(left=df2, right=holiday_national[["date","locale"]], how="left", on="date")
df3_national
Out[28]:
date store_nbr family sales onpromotion gas_price city state type cluster locale
0 2013-01-01 1 AUTOMOTIVE 0.0 0 93.14 Quito Pichincha D 13 National
1 2013-01-01 1 BABY CARE 0.0 0 93.14 Quito Pichincha D 13 National
2 2013-01-01 1 BEAUTY 0.0 0 93.14 Quito Pichincha D 13 National
3 2013-01-01 1 BEVERAGES 0.0 0 93.14 Quito Pichincha D 13 National
4 2013-01-01 1 BOOKS 0.0 0 93.14 Quito Pichincha D 13 National
... ... ... ... ... ... ... ... ... ... ... ...
3036523 2017-08-31 9 POULTRY NaN 1 47.26 Quito Pichincha B 6 NaN
3036524 2017-08-31 9 PREPARED FOODS NaN 0 47.26 Quito Pichincha B 6 NaN
3036525 2017-08-31 9 PRODUCE NaN 1 47.26 Quito Pichincha B 6 NaN
3036526 2017-08-31 9 SCHOOL AND OFFICE SUPPLIES NaN 9 47.26 Quito Pichincha B 6 NaN
3036527 2017-08-31 9 SEAFOOD NaN 0 47.26 Quito Pichincha B 6 NaN

3036528 rows × 11 columns

In [29]:
df3_national['locale'] = df3_national['locale'].fillna(0)
df3_national['national_holiday'] = df3_national['locale'].map(lambda x: 'No' if x==0 else 'Yes')
df3_national
Out[29]:
date store_nbr family sales onpromotion gas_price city state type cluster locale national_holiday
0 2013-01-01 1 AUTOMOTIVE 0.0 0 93.14 Quito Pichincha D 13 National Yes
1 2013-01-01 1 BABY CARE 0.0 0 93.14 Quito Pichincha D 13 National Yes
2 2013-01-01 1 BEAUTY 0.0 0 93.14 Quito Pichincha D 13 National Yes
3 2013-01-01 1 BEVERAGES 0.0 0 93.14 Quito Pichincha D 13 National Yes
4 2013-01-01 1 BOOKS 0.0 0 93.14 Quito Pichincha D 13 National Yes
... ... ... ... ... ... ... ... ... ... ... ... ...
3036523 2017-08-31 9 POULTRY NaN 1 47.26 Quito Pichincha B 6 0 No
3036524 2017-08-31 9 PREPARED FOODS NaN 0 47.26 Quito Pichincha B 6 0 No
3036525 2017-08-31 9 PRODUCE NaN 1 47.26 Quito Pichincha B 6 0 No
3036526 2017-08-31 9 SCHOOL AND OFFICE SUPPLIES NaN 9 47.26 Quito Pichincha B 6 0 No
3036527 2017-08-31 9 SEAFOOD NaN 0 47.26 Quito Pichincha B 6 0 No

3036528 rows × 12 columns

In [30]:
# add regional holidays:
In [31]:
holiday_regional=holiday.loc[holiday['locale'] == "Regional" , :]
holiday_regional
Out[31]:
date type locale locale_name description transferred
1 2012-04-01 Holiday Regional Cotopaxi Provincializacion de Cotopaxi False
7 2012-06-25 Holiday Regional Imbabura Provincializacion de Imbabura False
23 2012-11-06 Holiday Regional Santo Domingo de los Tsachilas Provincializacion de Santo Domingo False
24 2012-11-07 Holiday Regional Santa Elena Provincializacion Santa Elena False
47 2013-04-01 Holiday Regional Cotopaxi Provincializacion de Cotopaxi False
58 2013-06-25 Holiday Regional Imbabura Provincializacion de Imbabura False
76 2013-11-06 Holiday Regional Santo Domingo de los Tsachilas Provincializacion de Santo Domingo False
77 2013-11-07 Holiday Regional Santa Elena Provincializacion Santa Elena False
96 2014-04-01 Holiday Regional Cotopaxi Provincializacion de Cotopaxi False
112 2014-06-25 Holiday Regional Imbabura Provincializacion de Imbabura False
139 2014-11-06 Holiday Regional Santo Domingo de los Tsachilas Provincializacion de Santo Domingo False
140 2014-11-07 Holiday Regional Santa Elena Provincializacion Santa Elena False
165 2015-04-01 Holiday Regional Cotopaxi Provincializacion de Cotopaxi False
177 2015-06-25 Holiday Regional Imbabura Provincializacion de Imbabura False
193 2015-11-06 Holiday Regional Santo Domingo de los Tsachilas Provincializacion de Santo Domingo False
194 2015-11-07 Holiday Regional Santa Elena Provincializacion Santa Elena False
216 2016-04-01 Holiday Regional Cotopaxi Provincializacion de Cotopaxi False
259 2016-06-25 Holiday Regional Imbabura Provincializacion de Imbabura False
278 2016-11-06 Holiday Regional Santo Domingo de los Tsachilas Provincializacion de Santo Domingo False
279 2016-11-07 Holiday Regional Santa Elena Provincializacion Santa Elena False
302 2017-04-01 Holiday Regional Cotopaxi Provincializacion de Cotopaxi False
315 2017-06-25 Holiday Regional Imbabura Provincializacion de Imbabura False
334 2017-11-06 Holiday Regional Santo Domingo de los Tsachilas Provincializacion de Santo Domingo False
335 2017-11-07 Holiday Regional Santa Elena Provincializacion Santa Elena False
In [32]:
df3_regional = pd.merge(left=df3_national, right=holiday_regional[["date","locale_name"]], how="left", 
                        left_on=['date','state'],
                       right_on=['date','locale_name'])
df3_regional
Out[32]:
date store_nbr family sales onpromotion gas_price city state type cluster locale national_holiday locale_name
0 2013-01-01 1 AUTOMOTIVE 0.0 0 93.14 Quito Pichincha D 13 National Yes NaN
1 2013-01-01 1 BABY CARE 0.0 0 93.14 Quito Pichincha D 13 National Yes NaN
2 2013-01-01 1 BEAUTY 0.0 0 93.14 Quito Pichincha D 13 National Yes NaN
3 2013-01-01 1 BEVERAGES 0.0 0 93.14 Quito Pichincha D 13 National Yes NaN
4 2013-01-01 1 BOOKS 0.0 0 93.14 Quito Pichincha D 13 National Yes NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ...
3036523 2017-08-31 9 POULTRY NaN 1 47.26 Quito Pichincha B 6 0 No NaN
3036524 2017-08-31 9 PREPARED FOODS NaN 0 47.26 Quito Pichincha B 6 0 No NaN
3036525 2017-08-31 9 PRODUCE NaN 1 47.26 Quito Pichincha B 6 0 No NaN
3036526 2017-08-31 9 SCHOOL AND OFFICE SUPPLIES NaN 9 47.26 Quito Pichincha B 6 0 No NaN
3036527 2017-08-31 9 SEAFOOD NaN 0 47.26 Quito Pichincha B 6 0 No NaN

3036528 rows × 13 columns

In [33]:
df3_regional.locale_name.value_counts()
Out[33]:
Santo Domingo de los Tsachilas    396
Cotopaxi                          330
Imbabura                          165
Santa Elena                       132
Name: locale_name, dtype: int64
In [34]:
df3_regional["regional_holiday"] = df3_regional.locale_name.fillna(0).map(lambda x: 'No' if x==0 else 'Yes')
In [35]:
df3_regional
Out[35]:
date store_nbr family sales onpromotion gas_price city state type cluster locale national_holiday locale_name regional_holiday
0 2013-01-01 1 AUTOMOTIVE 0.0 0 93.14 Quito Pichincha D 13 National Yes NaN No
1 2013-01-01 1 BABY CARE 0.0 0 93.14 Quito Pichincha D 13 National Yes NaN No
2 2013-01-01 1 BEAUTY 0.0 0 93.14 Quito Pichincha D 13 National Yes NaN No
3 2013-01-01 1 BEVERAGES 0.0 0 93.14 Quito Pichincha D 13 National Yes NaN No
4 2013-01-01 1 BOOKS 0.0 0 93.14 Quito Pichincha D 13 National Yes NaN No
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3036523 2017-08-31 9 POULTRY NaN 1 47.26 Quito Pichincha B 6 0 No NaN No
3036524 2017-08-31 9 PREPARED FOODS NaN 0 47.26 Quito Pichincha B 6 0 No NaN No
3036525 2017-08-31 9 PRODUCE NaN 1 47.26 Quito Pichincha B 6 0 No NaN No
3036526 2017-08-31 9 SCHOOL AND OFFICE SUPPLIES NaN 9 47.26 Quito Pichincha B 6 0 No NaN No
3036527 2017-08-31 9 SEAFOOD NaN 0 47.26 Quito Pichincha B 6 0 No NaN No

3036528 rows × 14 columns

In [36]:
# add local holidays:
In [37]:
holiday_local=holiday.loc[holiday['locale'] == "Local" , :]
holiday_local
Out[37]:
date type locale locale_name description transferred
0 2012-03-02 Holiday Local Manta Fundacion de Manta False
2 2012-04-12 Holiday Local Cuenca Fundacion de Cuenca False
3 2012-04-14 Holiday Local Libertad Cantonizacion de Libertad False
4 2012-04-21 Holiday Local Riobamba Cantonizacion de Riobamba False
5 2012-05-12 Holiday Local Puyo Cantonizacion del Puyo False
... ... ... ... ... ... ...
339 2017-12-05 Additional Local Quito Fundacion de Quito-1 False
340 2017-12-06 Holiday Local Quito Fundacion de Quito True
341 2017-12-08 Holiday Local Loja Fundacion de Loja False
342 2017-12-08 Transfer Local Quito Traslado Fundacion de Quito False
344 2017-12-22 Holiday Local Salinas Cantonizacion de Salinas False

152 rows × 6 columns

In [38]:
df3_local = pd.merge(left=df3_regional, right=holiday_local[["date","locale_name"]], how="left", 
                        left_on=['date','city'],
                       right_on=['date','locale_name'])
df3_local
Out[38]:
date store_nbr family sales onpromotion gas_price city state type cluster locale national_holiday locale_name_x regional_holiday locale_name_y
0 2013-01-01 1 AUTOMOTIVE 0.0 0 93.14 Quito Pichincha D 13 National Yes NaN No NaN
1 2013-01-01 1 BABY CARE 0.0 0 93.14 Quito Pichincha D 13 National Yes NaN No NaN
2 2013-01-01 1 BEAUTY 0.0 0 93.14 Quito Pichincha D 13 National Yes NaN No NaN
3 2013-01-01 1 BEVERAGES 0.0 0 93.14 Quito Pichincha D 13 National Yes NaN No NaN
4 2013-01-01 1 BOOKS 0.0 0 93.14 Quito Pichincha D 13 National Yes NaN No NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3036787 2017-08-31 9 POULTRY NaN 1 47.26 Quito Pichincha B 6 0 No NaN No NaN
3036788 2017-08-31 9 PREPARED FOODS NaN 0 47.26 Quito Pichincha B 6 0 No NaN No NaN
3036789 2017-08-31 9 PRODUCE NaN 1 47.26 Quito Pichincha B 6 0 No NaN No NaN
3036790 2017-08-31 9 SCHOOL AND OFFICE SUPPLIES NaN 9 47.26 Quito Pichincha B 6 0 No NaN No NaN
3036791 2017-08-31 9 SEAFOOD NaN 0 47.26 Quito Pichincha B 6 0 No NaN No NaN

3036792 rows × 15 columns

In [39]:
df3_local['local_holiday'] = df3_local['locale_name_y'].fillna(0).map(lambda x: 'No' if x==0 else 'Yes')
df3_local
Out[39]:
date store_nbr family sales onpromotion gas_price city state type cluster locale national_holiday locale_name_x regional_holiday locale_name_y local_holiday
0 2013-01-01 1 AUTOMOTIVE 0.0 0 93.14 Quito Pichincha D 13 National Yes NaN No NaN No
1 2013-01-01 1 BABY CARE 0.0 0 93.14 Quito Pichincha D 13 National Yes NaN No NaN No
2 2013-01-01 1 BEAUTY 0.0 0 93.14 Quito Pichincha D 13 National Yes NaN No NaN No
3 2013-01-01 1 BEVERAGES 0.0 0 93.14 Quito Pichincha D 13 National Yes NaN No NaN No
4 2013-01-01 1 BOOKS 0.0 0 93.14 Quito Pichincha D 13 National Yes NaN No NaN No
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3036787 2017-08-31 9 POULTRY NaN 1 47.26 Quito Pichincha B 6 0 No NaN No NaN No
3036788 2017-08-31 9 PREPARED FOODS NaN 0 47.26 Quito Pichincha B 6 0 No NaN No NaN No
3036789 2017-08-31 9 PRODUCE NaN 1 47.26 Quito Pichincha B 6 0 No NaN No NaN No
3036790 2017-08-31 9 SCHOOL AND OFFICE SUPPLIES NaN 9 47.26 Quito Pichincha B 6 0 No NaN No NaN No
3036791 2017-08-31 9 SEAFOOD NaN 0 47.26 Quito Pichincha B 6 0 No NaN No NaN No

3036792 rows × 16 columns

In [40]:
df3_local['national_holiday'].value_counts()
Out[40]:
No     2774838
Yes     261954
Name: national_holiday, dtype: int64
In [41]:
df3_local['regional_holiday'].value_counts()
Out[41]:
No     3035769
Yes       1023
Name: regional_holiday, dtype: int64
In [42]:
df3_local['local_holiday'].value_counts()
Out[42]:
No     3024219
Yes      12573
Name: local_holiday, dtype: int64
In [43]:
df3 = df3_local[['date','store_nbr','type','cluster','family',
                 'sales','onpromotion','gas_price',
                 'national_holiday', 'regional_holiday', 'local_holiday']]
In [44]:
column_name = ['date','store_number','store_type','store_cluster','product_family',
                 'sales_volume','items_on_promotion','gas_price',
                 'national_holiday', 'regional_holiday', 'local_holiday']
In [45]:
df3.columns = column_name
In [46]:
df3['date'] = pd.to_datetime(df3['date'], format = '%Y-%m-%d')
/var/folders/s3/py6b6qd51n1bxdr0pnbsjwy40000gn/T/ipykernel_98229/1179531089.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['date'] = pd.to_datetime(df3['date'], format = '%Y-%m-%d')
In [47]:
data = df3.drop_duplicates(keep='first')
In [47]:
# add some date columns: year, month, day_of_week
In [65]:
data['year']=data['date'].dt.year.astype('category')
/var/folders/s3/py6b6qd51n1bxdr0pnbsjwy40000gn/T/ipykernel_98229/3995476843.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['year']=data['date'].dt.year.astype('category')
In [69]:
data['month']=data['date'].dt.month_name().astype('category')
/var/folders/s3/py6b6qd51n1bxdr0pnbsjwy40000gn/T/ipykernel_98229/328802076.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['month']=data['date'].dt.month_name().astype('category')
In [60]:
data['day_of_week'] = data['date'].dt.day_name().astype('category')
/var/folders/s3/py6b6qd51n1bxdr0pnbsjwy40000gn/T/ipykernel_98229/405239943.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['day_of_week'] = data['date'].dt.day_name().astype('category')
In [70]:
data.sample(50)
Out[70]:
date store_number store_type store_cluster product_family sales_volume items_on_promotion gas_price national_holiday regional_holiday local_holiday day_of_week year month
1115064 2014-09-19 45 A 11 PLAYERS AND ELECTRONICS 12.000000 0 92.430000 No No No Friday 2014 September
2656217 2017-01-30 30 C 3 HARDWARE 0.000000 0 52.630000 No No No Monday 2017 January
1765111 2015-09-19 35 C 3 CLEANING 561.000000 4 45.363333 No No No Saturday 2015 September
359617 2013-07-21 49 A 11 HOME AND KITCHEN II 0.000000 0 107.073333 No No No Sunday 2013 July
300118 2013-06-18 3 D 8 HOME AND KITCHEN II 0.000000 0 98.460000 No No No Tuesday 2013 June
310717 2013-06-24 27 D 1 LIQUOR,WINE,BEER 37.000000 0 95.070000 No No No Monday 2013 June
1968821 2016-01-12 50 A 14 DAIRY 849.000000 12 30.420000 No No No Tuesday 2016 January
2321873 2016-07-25 49 A 11 PET SUPPLIES 22.000000 0 42.400000 No No No Monday 2016 July
2373955 2016-08-24 11 B 6 BABY CARE 1.000000 0 46.290000 No No No Wednesday 2016 August
222450 2013-05-05 5 D 4 PRODUCE 14.000000 0 95.616667 No No No Sunday 2013 May
3003496 2017-08-13 25 D 1 BABY CARE 0.000000 0 47.996667 No No No Sunday 2017 August
423953 2013-08-26 54 C 3 BEAUTY 0.000000 0 105.880000 No No No Monday 2013 August
2892389 2017-06-11 8 D 8 BREAD/BAKERY 1101.784000 0 46.006667 No No No Sunday 2017 June
1888426 2015-11-27 45 A 11 BABY CARE 0.000000 0 40.570000 Yes No No Friday 2015 November
2571188 2016-12-12 44 A 5 PET SUPPLIES 25.000000 1 52.740000 No No No Monday 2016 December
1261070 2014-12-10 42 D 2 DAIRY 0.000000 0 60.990000 No No No Wednesday 2014 December
103271 2013-02-27 7 D 8 HARDWARE 3.000000 0 92.840000 No No No Wednesday 2013 February
1620868 2015-06-30 38 D 4 CLEANING 1049.000000 2 59.480000 No No No Tuesday 2015 June
1335575 2015-01-21 32 C 3 SEAFOOD 0.000000 0 47.850000 No No No Wednesday 2015 January
2646151 2017-01-24 48 A 14 GROCERY II 21.000000 0 52.380000 No No No Tuesday 2017 January
254365 2013-05-23 46 A 14 BABY CARE 0.000000 0 94.120000 No No No Thursday 2013 May
2631120 2017-01-16 26 D 10 PRODUCE 574.673000 4 52.427500 No No No Monday 2017 January
2336432 2016-08-02 8 D 8 SEAFOOD 49.267002 0 39.500000 No No No Tuesday 2016 August
2598389 2016-12-28 9 B 6 BEAUTY 14.000000 1 54.010000 No No No Wednesday 2016 December
1593304 2015-06-15 14 C 7 SCHOOL AND OFFICE SUPPLIES 0.000000 0 59.530000 No No No Monday 2015 June
1368961 2015-02-09 2 D 13 LIQUOR,WINE,BEER 63.000000 0 52.990000 No No No Monday 2015 February
1598559 2015-06-18 12 C 15 CELEBRATION 13.000000 0 60.410000 No No No Thursday 2015 June
2909726 2017-06-21 43 E 10 HOME APPLIANCES 0.000000 0 42.480000 No No No Wednesday 2017 June
3023649 2017-08-24 4 D 9 MEATS NaN 0 47.240000 No No No Thursday 2017 August
1284792 2014-12-23 9 B 6 BEVERAGES 2682.000000 4 56.780000 Yes No No Tuesday 2014 December
527198 2013-10-23 50 A 14 MAGAZINES 0.000000 0 96.900000 No No No Wednesday 2013 October
473580 2013-09-23 46 A 14 PRODUCE 5.000000 0 103.620000 No No No Monday 2013 September
848148 2014-04-22 7 D 8 HOME AND KITCHEN I 0.000000 0 101.690000 No No No Tuesday 2014 April
969796 2014-06-30 2 D 13 PERSONAL CARE 264.000000 0 106.070000 Yes No No Monday 2014 June
5601 2013-01-04 16 C 3 MEATS 13.625000 0 93.120000 No No No Friday 2013 January
2226219 2016-06-02 23 D 9 CELEBRATION 8.000000 0 49.140000 No No No Thursday 2016 June
1042489 2014-08-10 1 D 13 LADIESWEAR 0.000000 0 97.930000 Yes No No Sunday 2014 August
701211 2014-01-30 33 C 3 PLAYERS AND ELECTRONICS 3.000000 0 98.250000 No No No Thursday 2014 January
1156391 2014-10-12 6 D 13 BREAD/BAKERY 793.818000 1 85.776667 No No No Sunday 2014 October
1197895 2014-11-05 2 D 13 POULTRY 569.457000 0 78.710000 No No No Wednesday 2014 November
2404585 2016-09-10 20 B 6 CLEANING 1766.000000 35 46.013333 No No No Saturday 2016 September
497567 2013-10-07 2 D 13 PET SUPPLIES 0.000000 0 103.070000 No No No Monday 2013 October
405704 2013-08-16 42 D 2 BEAUTY 0.000000 0 107.580000 No No No Friday 2013 August
224970 2013-05-07 21 B 6 DELI 0.000000 0 95.280000 No No No Tuesday 2013 May
1988509 2016-01-23 52 A 11 POULTRY 0.000000 0 31.483333 No No No Saturday 2016 January
66152 2013-02-07 15 C 15 LAWN AND GARDEN 0.000000 0 95.840000 No No No Thursday 2013 February
2538739 2016-11-24 34 B 6 HOME AND KITCHEN II 24.000000 1 46.720000 No No No Thursday 2016 November
268872 2013-05-31 52 A 11 LINGERIE 0.000000 0 91.930000 No No No Friday 2013 May
812602 2014-04-03 1 D 13 EGGS 105.000000 0 100.290000 No No No Thursday 2014 April
1497485 2015-04-22 26 D 10 FROZEN FOODS 45.000000 2 56.170000 No No No Wednesday 2015 April
In [72]:
data.dtypes
Out[72]:
date                  datetime64[ns]
store_number                   int64
store_type                    object
store_cluster                  int64
product_family                object
sales_volume                 float64
items_on_promotion             int64
gas_price                    float64
national_holiday              object
regional_holiday              object
local_holiday                 object
day_of_week                 category
year                        category
month                       category
dtype: object
In [73]:
category_columns = ['store_number', 'store_type', 'store_cluster', 'product_family','year', 'month', 'day_of_week', 'national_holiday', 'regional_holiday','local_holiday']
data.loc[:, category_columns] = data.loc[:,category_columns].apply(lambda x: x.astype('category'))
/var/folders/s3/py6b6qd51n1bxdr0pnbsjwy40000gn/T/ipykernel_98229/2128232577.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[:, category_columns] = data.loc[:,category_columns].apply(lambda x: x.astype('category'))
In [74]:
data.dtypes
Out[74]:
date                  datetime64[ns]
store_number                category
store_type                  category
store_cluster               category
product_family              category
sales_volume                 float64
items_on_promotion             int64
gas_price                    float64
national_holiday            category
regional_holiday            category
local_holiday               category
day_of_week                 category
year                        category
month                       category
dtype: object

EDA¶

In [78]:
sns.heatmap(data.corr(),annot=True )
Out[78]:
<AxesSubplot:>
In [76]:
sns.scatterplot(data=data, x='items_on_promotion',y='sales_volume')
Out[76]:
<AxesSubplot:xlabel='items_on_promotion', ylabel='sales_volume'>
In [75]:
sns.scatterplot(data=data, x='gas_price',y='sales_volume')
Out[75]:
<AxesSubplot:xlabel='gas_price', ylabel='sales_volume'>

Part II: predictive analytics:¶

In [107]:
dataset = data.query('sales_volume!=0.0')
In [172]:
# train dataset:
train = dataset.loc[data['sales_volume'].notnull(),:]
X_train = train.drop(columns=['sales_volume','date','store_type','store_cluster'])
In [109]:
y_train = train['sales_volume']
In [110]:
# test dataset:
test = dataset.loc[data['sales_volume'].isnull(),:]
X_test = test.drop(columns=['sales_volume','date','store_type','store_cluster'])
In [82]:
from statsmodels.formula.api import ols
In [89]:
data.columns
Out[89]:
Index(['date', 'store_number', 'store_type', 'store_cluster', 'product_family',
       'sales_volume', 'items_on_promotion', 'gas_price', 'national_holiday',
       'regional_holiday', 'local_holiday', 'day_of_week', 'year', 'month'],
      dtype='object')
In [120]:
model = ols("np.log(sales_volume) ~ store_number + product_family + items_on_promotion + gas_price + national_holiday + regional_holiday + local_holiday + day_of_week + year + month", data=train).fit()
In [121]:
print(model.summary())
                             OLS Regression Results                             
================================================================================
Dep. Variable:     np.log(sales_volume)   R-squared:                       0.893
Model:                              OLS   Adj. R-squared:                  0.893
Method:                   Least Squares   F-statistic:                 1.549e+05
Date:                  Mon, 01 Aug 2022   Prob (F-statistic):               0.00
Time:                          09:58:31   Log-Likelihood:            -2.3772e+06
No. Observations:               2061758   AIC:                         4.755e+06
Df Residuals:                   2061646   BIC:                         4.756e+06
Df Model:                           111                                         
Covariance Type:              nonrobust                                         
================================================================================================================
                                                   coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------------------
Intercept                                        1.3455      0.009    149.419      0.000       1.328       1.363
store_number[T.2]                                0.3367      0.005     64.526      0.000       0.326       0.347
store_number[T.3]                                1.1616      0.005    224.954      0.000       1.152       1.172
store_number[T.4]                                0.1665      0.005     31.795      0.000       0.156       0.177
store_number[T.5]                                0.1071      0.005     20.469      0.000       0.097       0.117
store_number[T.6]                                0.4671      0.005     89.202      0.000       0.457       0.477
store_number[T.7]                                0.5024      0.005     96.524      0.000       0.492       0.513
store_number[T.8]                                0.7540      0.005    145.263      0.000       0.744       0.764
store_number[T.9]                                0.3694      0.005     70.710      0.000       0.359       0.380
store_number[T.10]                              -0.7124      0.005   -130.977      0.000      -0.723      -0.702
store_number[T.11]                               0.2475      0.005     47.036      0.000       0.237       0.258
store_number[T.12]                              -0.4503      0.005    -82.550      0.000      -0.461      -0.440
store_number[T.13]                              -0.6395      0.005   -117.546      0.000      -0.650      -0.629
store_number[T.14]                              -0.4347      0.005    -79.430      0.000      -0.445      -0.424
store_number[T.15]                              -0.4272      0.005    -79.557      0.000      -0.438      -0.417
store_number[T.16]                              -0.4997      0.005    -91.332      0.000      -0.510      -0.489
store_number[T.17]                              -0.0539      0.005    -10.010      0.000      -0.064      -0.043
store_number[T.18]                              -0.1656      0.005    -30.605      0.000      -0.176      -0.155
store_number[T.19]                              -0.4186      0.005    -78.049      0.000      -0.429      -0.408
store_number[T.20]                              -0.0242      0.006     -3.970      0.000      -0.036      -0.012
store_number[T.21]                              -0.0821      0.006    -12.772      0.000      -0.095      -0.070
store_number[T.22]                              -0.5883      0.007    -85.176      0.000      -0.602      -0.575
store_number[T.23]                              -0.2774      0.005    -51.977      0.000      -0.288      -0.267
store_number[T.24]                               0.4624      0.005     87.303      0.000       0.452       0.473
store_number[T.25]                              -0.2982      0.005    -55.567      0.000      -0.309      -0.288
store_number[T.26]                              -0.7841      0.005   -145.462      0.000      -0.795      -0.774
store_number[T.27]                               0.1814      0.005     34.711      0.000       0.171       0.192
store_number[T.28]                              -0.0116      0.005     -2.184      0.029      -0.022      -0.001
store_number[T.29]                              -0.1235      0.006    -19.851      0.000      -0.136      -0.111
store_number[T.30]                              -0.7480      0.005   -137.083      0.000      -0.759      -0.737
store_number[T.31]                              -0.0883      0.005    -16.662      0.000      -0.099      -0.078
store_number[T.32]                              -1.0893      0.006   -194.729      0.000      -1.100      -1.078
store_number[T.33]                              -0.1955      0.005    -35.932      0.000      -0.206      -0.185
store_number[T.34]                              -0.0874      0.005    -16.578      0.000      -0.098      -0.077
store_number[T.35]                              -0.8988      0.006   -161.014      0.000      -0.910      -0.888
store_number[T.36]                              -0.2108      0.005    -38.512      0.000      -0.222      -0.200
store_number[T.37]                               0.2857      0.005     54.768      0.000       0.275       0.296
store_number[T.38]                               0.2261      0.005     43.276      0.000       0.216       0.236
store_number[T.39]                               0.1407      0.005     26.941      0.000       0.130       0.151
store_number[T.40]                              -0.1553      0.005    -28.772      0.000      -0.166      -0.145
store_number[T.41]                              -0.0083      0.005     -1.588      0.112      -0.019       0.002
store_number[T.42]                               0.1347      0.006     20.791      0.000       0.122       0.147
store_number[T.43]                              -0.2468      0.005    -45.158      0.000      -0.258      -0.236
store_number[T.44]                               1.3486      0.005    261.301      0.000       1.338       1.359
store_number[T.45]                               1.1438      0.005    220.695      0.000       1.134       1.154
store_number[T.46]                               0.8637      0.005    166.673      0.000       0.854       0.874
store_number[T.47]                               1.0795      0.005    208.540      0.000       1.069       1.090
store_number[T.48]                               0.6827      0.005    130.937      0.000       0.672       0.693
store_number[T.49]                               0.9463      0.005    182.597      0.000       0.936       0.956
store_number[T.50]                               0.5762      0.005    110.755      0.000       0.566       0.586
store_number[T.51]                               0.6281      0.005    120.917      0.000       0.618       0.638
store_number[T.52]                               0.6498      0.013     48.521      0.000       0.624       0.676
store_number[T.53]                              -0.2024      0.006    -35.533      0.000      -0.214      -0.191
store_number[T.54]                              -0.6133      0.005   -111.998      0.000      -0.624      -0.603
product_family[T.BABY CARE]                     -1.1534      0.011   -106.097      0.000      -1.175      -1.132
product_family[T.BEAUTY]                        -0.4876      0.004   -121.189      0.000      -0.496      -0.480
product_family[T.BEVERAGES]                      5.8800      0.004   1536.894      0.000       5.873       5.888
product_family[T.BOOKS]                         -1.6696      0.015   -112.276      0.000      -1.699      -1.640
product_family[T.BREAD/BAKERY]                   4.3549      0.004   1145.111      0.000       4.347       4.362
product_family[T.CELEBRATION]                    0.5985      0.004    137.251      0.000       0.590       0.607
product_family[T.CLEANING]                       5.2675      0.004   1381.249      0.000       5.260       5.275
product_family[T.DAIRY]                          4.6604      0.004   1221.097      0.000       4.653       4.668
product_family[T.DELI]                           3.8252      0.004   1003.864      0.000       3.818       3.833
product_family[T.EGGS]                           3.3146      0.004    872.086      0.000       3.307       3.322
product_family[T.FROZEN FOODS]                   2.9191      0.004    768.136      0.000       2.912       2.927
product_family[T.GROCERY I]                      6.4008      0.004   1634.661      0.000       6.393       6.409
product_family[T.GROCERY II]                     0.9919      0.004    258.169      0.000       0.984       0.999
product_family[T.HARDWARE]                      -1.1542      0.004   -259.242      0.000      -1.163      -1.146
product_family[T.HOME AND KITCHEN I]             1.3623      0.004    326.526      0.000       1.354       1.370
product_family[T.HOME AND KITCHEN II]            1.1153      0.004    267.231      0.000       1.107       1.123
product_family[T.HOME APPLIANCES]               -1.4861      0.006   -263.069      0.000      -1.497      -1.475
product_family[T.HOME CARE]                      3.8177      0.004    888.180      0.000       3.809       3.826
product_family[T.LADIESWEAR]                     0.5387      0.005    110.542      0.000       0.529       0.548
product_family[T.LAWN AND GARDEN]               -0.0471      0.004    -10.603      0.000      -0.056      -0.038
product_family[T.LINGERIE]                       0.1252      0.004     32.210      0.000       0.118       0.133
product_family[T.LIQUOR,WINE,BEER]               2.5024      0.004    639.785      0.000       2.495       2.510
product_family[T.MAGAZINES]                     -0.4514      0.005    -95.216      0.000      -0.461      -0.442
product_family[T.MEATS]                          3.9396      0.004   1036.065      0.000       3.932       3.947
product_family[T.PERSONAL CARE]                  3.8681      0.004   1017.526      0.000       3.861       3.876
product_family[T.PET SUPPLIES]                  -0.2070      0.005    -44.292      0.000      -0.216      -0.198
product_family[T.PLAYERS AND ELECTRONICS]        0.2599      0.004     59.200      0.000       0.251       0.269
product_family[T.POULTRY]                        3.8923      0.004   1023.929      0.000       3.885       3.900
product_family[T.PREPARED FOODS]                 2.6472      0.004    696.674      0.000       2.640       2.655
product_family[T.PRODUCE]                        4.3704      0.004   1062.705      0.000       4.362       4.378
product_family[T.SCHOOL AND OFFICE SUPPLIES]    -0.6847      0.006   -119.871      0.000      -0.696      -0.674
product_family[T.SEAFOOD]                        0.8532      0.004    217.187      0.000       0.846       0.861
national_holiday[T.Yes]                          0.1214      0.002     60.463      0.000       0.117       0.125
regional_holiday[T.Yes]                          0.0863      0.030      2.919      0.004       0.028       0.144
local_holiday[T.Yes]                            -0.0210      0.008     -2.539      0.011      -0.037      -0.005
day_of_week[T.Monday]                           -0.0105      0.002     -5.233      0.000      -0.014      -0.007
day_of_week[T.Saturday]                          0.3028      0.002    151.936      0.000       0.299       0.307
day_of_week[T.Sunday]                            0.2766      0.002    137.387      0.000       0.273       0.281
day_of_week[T.Thursday]                         -0.0990      0.002    -49.227      0.000      -0.103      -0.095
day_of_week[T.Tuesday]                          -0.0538      0.002    -26.855      0.000      -0.058      -0.050
day_of_week[T.Wednesday]                        -0.0673      0.002    -33.529      0.000      -0.071      -0.063
year[T.2014]                                     0.2383      0.002    126.963      0.000       0.235       0.242
year[T.2015]                                     0.2241      0.004     55.326      0.000       0.216       0.232
year[T.2016]                                     0.2524      0.004     57.396      0.000       0.244       0.261
year[T.2017]                                     0.3869      0.004     91.868      0.000       0.379       0.395
month[T.August]                                  0.0500      0.003     18.922      0.000       0.045       0.055
month[T.December]                                0.2555      0.003     90.043      0.000       0.250       0.261
month[T.February]                                0.0016      0.003      0.593      0.553      -0.004       0.007
month[T.January]                                 0.0417      0.003     16.091      0.000       0.037       0.047
month[T.July]                                    0.0636      0.003     25.189      0.000       0.059       0.069
month[T.June]                                    0.0308      0.003     11.929      0.000       0.026       0.036
month[T.March]                                   0.0455      0.003     17.803      0.000       0.040       0.050
month[T.May]                                    -0.0021      0.003     -0.803      0.422      -0.007       0.003
month[T.November]                                0.0633      0.003     22.764      0.000       0.058       0.069
month[T.October]                                 0.0873      0.003     32.151      0.000       0.082       0.093
month[T.September]                               0.1059      0.003     38.530      0.000       0.101       0.111
items_on_promotion                               0.0071   4.13e-05    171.654      0.000       0.007       0.007
gas_price                                       -0.0022   7.33e-05    -29.315      0.000      -0.002      -0.002
==============================================================================
Omnibus:                   694655.121   Durbin-Watson:                   2.046
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          6912581.068
Skew:                          -1.336   Prob(JB):                         0.00
Kurtosis:                      11.563   Cond. No.                     3.78e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.78e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [123]:
train['predicted_sales_volume'] = np.exp(model.predict(X_train))
/var/folders/s3/py6b6qd51n1bxdr0pnbsjwy40000gn/T/ipykernel_98229/1406944995.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['predicted_sales_volume'] = np.exp(model.predict(X_train))
In [128]:
train[['sales_volume'].plot(figsize=(16,8))
Out[128]:
<AxesSubplot:>
In [153]:
test['predicted_sales_volume'] = np.ceil(np.exp(model.predict(X_test)))
/var/folders/s3/py6b6qd51n1bxdr0pnbsjwy40000gn/T/ipykernel_98229/656582870.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['predicted_sales_volume'] = np.ceil(np.exp(model.predict(X_test)))
In [169]:
sns.relplot(data=test, x='date' , y ='predicted_sales_volume', hue="product_family", markers=True, row='store_number' , kind='line', aspect = 2,
           facet_kws={'sharey': False, 'sharex': False})
Out[169]:
<seaborn.axisgrid.FacetGrid at 0x216e0ae20>
In [161]:
test.to_csv('forecast_final.csv')
In [171]:
train.to_csv('historical_final.csv')
In [163]:
data.to_csv('integrated_final.csv')